Model:
ID and DAY_IDEDA:
TARGET-vars correlationlibrary(readr)
library(dplyr)
##
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
library(Hmisc)
##
## Attachement du package : 'Hmisc'
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## src, summarize
## Les objets suivants sont masqués depuis 'package:base':
##
## format.pval, units
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(PerformanceAnalytics)
## Le chargement a nécessité le package : xts
## Le chargement a nécessité le package : zoo
##
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attachement du package : 'xts'
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## first, last
##
## Attachement du package : 'PerformanceAnalytics'
## L'objet suivant est masqué depuis 'package:graphics':
##
## legend
X_train_raw <- read.csv("../data/X_train.csv")
X_test_raw <- read.csv("../data/X_test.csv")
y_train <- read.csv("../data/y_train.csv")
Shape of the data sets:
print("X train:", )
## [1] "X train:"
dim(X_train_raw)
## [1] 1494 35
print("X test:", )
## [1] "X test:"
dim(X_test_raw)
## [1] 654 35
print("y train:", )
## [1] "y train:"
dim(y_train)
## [1] 1494 2
Are samples’ IDs in X and y aligned? Number of misaligned rows:
sum(X_train_raw$ID != y_train$ID)
## [1] 0
Yes, they are.
Missing values by column. Training set:
tibble_ <- X_train_raw
na_cols_train <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
na_cols_train
Test set:
tibble_ <- X_test_raw
na_cols_test <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
na_cols_test
Are cols with NAs the same in training and test sets?
print("In train set only:")
## [1] "In train set only:"
print(setdiff(na_cols_train$column_names, na_cols_test$column_names))
## character(0)
print("In test set only:")
## [1] "In test set only:"
print(setdiff(na_cols_test$column_names, na_cols_train$column_names))
## character(0)
Columns’ dtypes. The dtypes of all columns have been properly detected on the data import.
str(X_train_raw)
## 'data.frame': 1494 obs. of 35 variables:
## $ ID : int 1054 2049 1924 297 1101 1520 1546 1069 1323 1618 ...
## $ DAY_ID : int 206 501 687 720 818 467 144 1136 83 307 ...
## $ COUNTRY : chr "FR" "FR" "FR" "DE" ...
## $ DE_CONSUMPTION : num 0.2101 -0.0224 1.395 -0.9833 0.1438 ...
## $ FR_CONSUMPTION : num -0.427 -1.003 1.979 -0.849 -0.617 ...
## $ DE_FR_EXCHANGE : num -0.6065 -0.0221 1.0213 -0.8396 -0.925 ...
## $ FR_DE_EXCHANGE : num 0.6065 0.0221 -1.0213 0.8396 0.925 ...
## $ DE_NET_EXPORT : num NA -0.574 -0.622 -0.271 NA ...
## $ FR_NET_EXPORT : num 0.693 -1.131 -1.683 0.563 0.99 ...
## $ DE_NET_IMPORT : num NA 0.574 0.622 0.271 NA ...
## $ FR_NET_IMPORT : num -0.693 1.131 1.683 -0.563 -0.99 ...
## $ DE_GAS : num 0.441 0.175 2.352 0.488 0.239 ...
## $ FR_GAS : num -0.214 0.427 2.122 0.195 -0.241 ...
## $ DE_COAL : num 0.741 -0.17 1.572 -1.474 1.004 ...
## $ FR_COAL : num 0.289 -0.762 0.777 -0.786 -0.275 ...
## $ DE_HYDRO : num 2.209 0.188 -0.109 -0.368 -0.23 ...
## $ FR_HYDRO : num 0.208 -0.807 0.779 1.32 -0.796 ...
## $ DE_NUCLEAR : num 0.70961 -1.88274 -1.89711 -0.20555 -0.00558 ...
## $ FR_NUCLEAR : num -0.19 -2.186 0.735 -1.59 0.177 ...
## $ DE_SOLAR : num 0.102 1.987 -1.116 1.752 0.694 ...
## $ FR_SOLAR : num 1.249 3.237 -0.371 0.563 0.724 ...
## $ DE_WINDPOW : num -0.5734 -0.0355 -0.2988 -0.0101 -0.7749 ...
## $ FR_WINDPOW : num -0.269 -0.107 -0.141 0.367 -0.564 ...
## $ DE_LIGNITE : num 0.87 -0.194 0.428 -2.331 0.691 ...
## $ DE_RESIDUAL_LOAD: num 0.627 -0.395 1.337 -1.192 0.572 ...
## $ FR_RESIDUAL_LOAD: num -0.445 -1.183 1.947 -0.977 -0.526 ...
## $ DE_RAIN : num -0.173 -1.24 -0.481 -1.115 -0.541 ...
## $ FR_RAIN : num -0.556 -0.77 -0.313 -0.508 -0.425 ...
## $ DE_WIND : num -0.791 1.522 0.431 -0.499 -1.088 ...
## $ FR_WIND : num -0.283 0.828 0.488 -0.236 -1.012 ...
## $ DE_TEMP : num -1.069 0.437 0.685 0.351 0.614 ...
## $ FR_TEMP : num -0.0634 1.8312 0.1148 -0.4175 0.7295 ...
## $ GAS_RET : num 0.339 -0.659 0.536 0.912 0.245 ...
## $ COAL_RET : num 0.1246 0.0471 0.7433 -0.2962 1.5266 ...
## $ CARBON_RET : num -0.00244 -0.49037 0.20495 1.07395 2.61438 ...
COUNTRY var and country label prefixValue counts in the COUNTRY column:
X_train_raw %>%
count(COUNTRY) %>%
mutate(percentage = round(n / sum(n) * 100, 1))
The data describes two countries, FR and DE, roughly equally presented.
Duplicates among DAY_ID in the training set:
X_train_raw %>%
count(DAY_ID) %>%
mutate(n = n - 1) %>%
rename(num_of_duplicates = n) %>%
count(num_of_duplicates) %>%
rename(count = n) %>%
mutate(percentage = round(count / sum(count) * 100, 1))
Most of the lines (3/4) have duplicates.
643*2 + 208 gives the number of rows in the training set.
Duplicates among DAY_ID in the test set:
X_test_raw %>%
count(DAY_ID) %>%
mutate(n = n - 1) %>%
rename(num_of_duplicates = n) %>%
count(num_of_duplicates) %>%
rename(count = n) %>%
mutate(percentage = round(count / sum(count) * 100, 1))
The number of duplicates is more or less the same as in the training set.
For the rows that have the same DAY_ID, in which columns
are they different? Training set:
tibble_ <- X_train_raw
tibble_$TARGET <- y_train$TARGET
#' Given a tibble with two rows and any number of columns, identify columns that
#' have different values in two rows.
#'
#' This function takes a tibble with dimensions [2, any] as input and returns a
#' list of column names.
#'
#' @param day_id_ Tibble with dimensions [2, any].
#' @return List of column names.
non_duplicated_cols <- function(day_id_) {
col_names <- tibble_ %>%
filter(DAY_ID == day_id_) %>%
select_if(function(column) any(!identical(column[1], column[2]))) %>%
names()
return(col_names)
}
# get the list of day IDs that are duplicated in the table
ids <- tibble_ %>%
count(DAY_ID) %>%
filter(n > 1) %>%
pull(DAY_ID)
# for each pair of duplicated rows, get a nested list of column names that have
# different values in the pair of duplicated rows
results <- lapply(ids, non_duplicated_cols)
# flatten the nested list, get unique values
results <- unique(unlist(results, recursive = FALSE))
results
## [1] "ID" "COUNTRY" "TARGET"
Same question for the test set (no TARGET):
tibble_ <- X_test_raw
#' Given a tibble with two rows and any number of columns, identify columns that
#' have different values in two rows.
#'
#' This function takes a tibble with dimensions [2, any] as input and returns a
#' list of column names.
#'
#' @param day_id_ Tibble with dimensions [2, any].
#' @return List of column names.
non_duplicated_cols <- function(day_id_) {
col_names <- tibble_ %>%
filter(DAY_ID == day_id_) %>%
select_if(function(column) any(!identical(column[1], column[2]))) %>%
names()
return(col_names)
}
# get the list of day IDs that are duplicated in the table
ids <- tibble_ %>%
count(DAY_ID) %>%
filter(n > 1) %>%
pull(DAY_ID)
# for each pair of duplicated rows, get a nested list of column names that have
# different values in the pair of duplicated rows
results <- lapply(ids, non_duplicated_cols)
# flatten the nested list, get unique values
results <- unique(unlist(results, recursive = FALSE))
results
## [1] "ID" "COUNTRY"
Thus the rows having the same value of DAY_ID differ
only by ID (obviously), COUNTRY (FR or DE) and
TARGET (prediction for FR or DE). For the rest they are
identical.
DAY_ID
duplicates.First select them:
# get the list of day IDs that are not duplicated in the training set
ids_train <- X_train_raw %>%
count(DAY_ID) %>%
filter(n == 1) %>%
pull(DAY_ID)
# get the list of day IDs that are not duplicated in the test set
ids_test <- X_test_raw %>%
count(DAY_ID) %>%
filter(n == 1) %>%
pull(DAY_ID)
To which countries do correspond these lines?
print(X_train_raw %>% filter(DAY_ID %in% ids_train) %>% count(COUNTRY))
## COUNTRY n
## 1 FR 208
print(X_test_raw %>% filter(DAY_ID %in% ids_test) %>% count(COUNTRY))
## COUNTRY n
## 1 FR 76
All rows that don’t have a duplicate in DAY_ID
correspond to country FR.
What about missing vals in these non duplicated lines?
tibble_ <- X_train_raw %>% filter(DAY_ID %in% ids_train)
na_cols_train <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_train)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_NET_EXPORT 59.6
## 2 DE_NET_IMPORT 59.6
## 3 FR_NET_EXPORT 33.7
## 4 FR_NET_IMPORT 33.7
## 5 DE_FR_EXCHANGE 12
## 6 FR_DE_EXCHANGE 12
tibble_ <- X_test_raw %>% filter(DAY_ID %in% ids_test)
na_cols_test <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_test)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_NET_EXPORT 61.8
## 2 DE_NET_IMPORT 61.8
## 3 FR_NET_EXPORT 31.6
## 4 FR_NET_IMPORT 31.6
## 5 DE_FR_EXCHANGE 11.8
## 6 FR_DE_EXCHANGE 11.8
To conclude, the lines that don’t have duplicates in
DAY_ID have NAs in the cols describing countries’ net
import and export and DE<->FR exchange
(DE_FR_EXCHANGE, FR_DE_EXCHANGE,
DE_NET_EXPORT, FR_NET_EXPORT,
DE_NET_IMPORT, FR_NET_IMPORT).
And in duplicated lines?
tibble_ <- X_train_raw %>% filter(!(DAY_ID %in% ids_train))
na_cols_train <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_train)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_RAIN 7.3
## 2 FR_RAIN 7.3
## 3 DE_WIND 7.3
## 4 FR_WIND 7.3
## 5 DE_TEMP 7.3
## 6 FR_TEMP 7.3
tibble_ <- X_test_raw %>% filter(!(DAY_ID %in% ids_test))
na_cols_test <- tibble(
column_names = names(tibble_),
missing_percentage = round(colMeans(is.na(tibble_)) * 100, 1)
) %>%
arrange(desc(missing_percentage)) %>%
filter(missing_percentage > 0)
print(na_cols_test)
## # A tibble: 6 × 2
## column_names missing_percentage
## <chr> <dbl>
## 1 DE_RAIN 6.9
## 2 FR_RAIN 6.9
## 3 DE_WIND 6.9
## 4 FR_WIND 6.9
## 5 DE_TEMP 6.9
## 6 FR_TEMP 6.9
The lines having duplicates in DAY_ID have NAs only in
the cols describing countries’ weather conditions (DE_RAIN,
FR_RAIN, DE_WIND, FR_WIND,
DE_TEMP, FR_TEMP).
The two lists of columns above span all (12) the columns with missing values.
Find the groups of columns that have simultaneously NAs in one row for the training set:
missing_columns_list <- X_train_raw %>%
pmap(function(...) names(list(...))[sapply(list(...), is.na)]) %>%
Filter(function(x) length(x) > 0, .) %>%
unique(.)
missing_columns_list
## [[1]]
## [1] "DE_NET_EXPORT" "DE_NET_IMPORT"
##
## [[2]]
## [1] "DE_FR_EXCHANGE" "FR_DE_EXCHANGE" "DE_NET_EXPORT" "FR_NET_EXPORT"
## [5] "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[3]]
## [1] "DE_NET_EXPORT" "FR_NET_EXPORT" "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[4]]
## [1] "DE_RAIN" "FR_RAIN" "DE_WIND" "FR_WIND" "DE_TEMP" "FR_TEMP"
Same for the test set:
X_test_raw %>%
pmap(function(...) names(list(...))[sapply(list(...), is.na)]) %>%
Filter(function(x) length(x) > 0, .) %>%
unique(.)
## [[1]]
## [1] "DE_NET_EXPORT" "DE_NET_IMPORT"
##
## [[2]]
## [1] "DE_FR_EXCHANGE" "FR_DE_EXCHANGE" "DE_NET_EXPORT" "FR_NET_EXPORT"
## [5] "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[3]]
## [1] "DE_NET_EXPORT" "FR_NET_EXPORT" "DE_NET_IMPORT" "FR_NET_IMPORT"
##
## [[4]]
## [1] "DE_RAIN" "FR_RAIN" "DE_WIND" "FR_WIND" "DE_TEMP" "FR_TEMP"
Apparently we get the same groups of columns.
As a side note, the analysis above shows that the test set has the same properties as the training set and thus it is a representative sample of the data.
To conclude the analysis above:
DAY_ID col have NAs in all 6 cols describing countries’
weather conditions (DE_RAIN, FR_RAIN,
DE_WIND, FR_WIND, DE_TEMP,
FR_TEMP).DAY_ID col can have NAs simultaneously in the following
groups of cols:
DE_NET_EXPORT, DE_NET_IMPORTDE_NET_EXPORT, FR_NET_EXPORT,
DE_NET_IMPORT, FR_NET_IMPORTDE_FR_EXCHANGE, FR_DE_EXCHANGE,
DE_NET_EXPORT, FR_NET_EXPORT,
DE_NET_IMPORT, FR_NET_IMPORTExcluding vars ID and DAY_ID useless for
the prediction and setting apart COUNTRY, we are left with
numeric vars only.
Pair corr matrix. I use Spearman corr to capture non-linear corrs.
res <- cor(select(X_train_raw, -ID, -DAY_ID, -COUNTRY), method = "spearman", use = "complete.obs")
res2 <- rcorr(as.matrix(select(X_train_raw, -ID, -DAY_ID, -COUNTRY)), type = "spearman")
Plot:
corrplot(
res,
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.4)
Let’s explore in more details the correlation matrix. For the convenience we will flatten it together with the matrix of p-values.
# Code source: http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software#a-simple-function-to-format-the-correlation-matri
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res2_flat <- flattenCorrMatrix(res2$r, res2$P) %>%
mutate(cor = round(cor, 2), p = round(100 * p, 0)) %>%
arrange(desc(abs(cor)))
res2_flat
First we notice that all correlations with an absolute value greater than 0.07 are significant with p-value < 1%.
Other observations:
DE_FR_EXCHANGE vs FR_DE_EXCHANGE,
DE_NET_EXPORT vs DE_NET_IMPORT,
FR_NET_EXPORT vs FR_NET_IMPORT. Actually one
variable in the pair is the inverse of another. The plot below shows all
pair correlations between these variables. From it , and from the
correlation matrix, we also notice that DE<->FR exchange is
strongly correlated with DE/FR net import/export. At the same time
DE_NET_IMPORT and FR_NET_IMPORT are
decorrelated. How to explain this?chart.Correlation(
select(X_train_raw, DE_FR_EXCHANGE, FR_DE_EXCHANGE, DE_NET_EXPORT, DE_NET_IMPORT, FR_NET_EXPORT, FR_NET_IMPORT),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
chart.Correlation(
select(X_train_raw, FR_DE_EXCHANGE, DE_NET_EXPORT, FR_NET_EXPORT),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relation between FR/DE export, FR <-> DE exchange and FR/DE consumption (total and without renewable).
FR_CONSUMPTION and FR_RESIDUAL_LOAD versus
DE_CONSUMPTION and DE_RESIDUAL_LOAD. Could it
be attributed to the different proportion of the renewable sources in
two countries?FR_DE_EXCHANGE and FR_CONSUMPTION.chart.Correlation(
select(X_train_raw, FR_DE_EXCHANGE, DE_NET_EXPORT, FR_NET_EXPORT, FR_CONSUMPTION, DE_CONSUMPTION, FR_RESIDUAL_LOAD, DE_RESIDUAL_LOAD),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relation between weather conditions in two countries. Rather strong correlation between temperature and wind in FR and DE, the amount of rain in two countries is essentially unrelated.
chart.Correlation(
select(X_train_raw, DE_RAIN, FR_RAIN, DE_WIND, FR_WIND, DE_TEMP, FR_TEMP),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relation between weather conditions and production of renewable energy in two countries.
x_WIND and
x_WINDPOW for both countries, as it consisted of two approx
linear relations with different slopes.chart.Correlation(
select(X_train_raw, FR_RAIN, FR_WIND, FR_TEMP, FR_SOLAR, FR_WINDPOW, FR_HYDRO),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
chart.Correlation(
select(X_train_raw, DE_RAIN, DE_WIND, DE_TEMP, DE_SOLAR, DE_WINDPOW, DE_HYDRO),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Relations between countries’ consumption rates, exchange between them and whether conditions.
No clear relationship between any of the pair of variables.
chart.Correlation(
select(X_train_raw, FR_CONSUMPTION, FR_RESIDUAL_LOAD, FR_DE_EXCHANGE, FR_NET_EXPORT, FR_RAIN, FR_WIND, FR_TEMP),
histogram=TRUE,
pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
# merged_tibble <- left_join(X_train_raw, y_train, by = "ID")
#
# merged_tibble %>%
# filter(COUNTRY == "FR") %>%
# select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
We exclude from the analysis ID, DAY_ID and
perfectly correlated vars DE_FR_EXCHANGE,
FR_NET_IMPORT, DE_NET_IMPORT.
For both countries there is no clear relationship between any of the explanatory variable and target.
For France:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "FR") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Create a vector of explanatory variable names
explanatory_vars <- colnames(select(merged_tibble, -TARGET))
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. Target Variable for FR"))
print(res)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 124 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 124 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 70 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 70 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
For Germany:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "DE") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Create a vector of explanatory variable names
explanatory_vars <- colnames(select(merged_tibble, -TARGET))
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. Target Variable for DE"))
print(res)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Both countries together:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Create a vector of explanatory variable names
explanatory_vars <- colnames(select(merged_tibble, -TARGET))
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, -TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var, y = "TARGET")) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "red", span = 0.9) +
labs(x = var, y = "Target Variable") +
ggtitle(paste("Scatter Plot of", var, "vs. Target Variable for DE"))
print(res)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 124 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 124 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 70 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 70 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 94 rows containing non-finite values (`stat_smooth()`).
## Removed 94 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
For France.
Notice the distribution of x_CONSUMPTION,
x_COAL, x_GAZ and x_NUCLEAR and
the difference in these distribs for the two countries.
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "FR") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Generate scatter plots for each explanatory variable
for (var in colnames(merged_tibble)) {
res <- ggplot(merged_tibble, aes_string(x = var)) +
geom_histogram() +
ggtitle(paste(var, " for FR"))
print(res)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 124 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 70 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 47 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For Germany only the TARGET distribution is different, all other cols are identical to France:
# Merge the two tibbles based on a common identifier
merged_tibble <- left_join(X_train_raw, y_train, by = "ID") %>%
filter(COUNTRY == "DE") %>%
select(-ID, -DAY_ID, -COUNTRY, -DE_FR_EXCHANGE, -FR_NET_IMPORT, -DE_NET_IMPORT)
# Generate scatter plots for each explanatory variable
for (var in colnames(select(merged_tibble, TARGET))) {
res <- ggplot(merged_tibble, aes_string(x = var)) +
geom_histogram() +
ggtitle(paste(var, " for DE"))
print(res)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Fill-in NAs:
X_train <- X_train_raw %>% replace(is.na(.), 0)
X_test <- X_test_raw %>% replace(is.na(.), 0)
Drop unused cols. Mind that we use ID and
DAY_ID as explanatory vars.
X_train <- X_train %>% select(-COUNTRY)
X_test <- X_test %>% select(-COUNTRY)
Fit OLS:
lm_ <- lm(y_train$TARGET ~ ., data = X_train)
Compare with the OLS model in Py benchmlark notebook:
py_lm_ <- read.csv("../tmp/py_benchmark_model_coeffs.csv")
r_lm_ <- tibble(param = names(coef(lm_)), val = coef(lm_))
# rename Intercept
r_lm_$param[1] = "intercept"
# outer join between two tibbles
# ...
lm_join_ <- full_join(py_lm_, r_lm_, by = "param", suffix = c(".py", ".r"))
# element wise comparison of the models' coefficients
lm_join_$equal <- near(py_lm_$val, r_lm_$val)
lm_join_
Except few variables (DE_FR_EXCHANGE (different),
FR_DE_EXCHANGE (NA), DE_NET_EXPORT
(different),FR_NET_EXPORT (different),
DE_NET_IMPORT(NA), FR_NET_IMPORT (NA)), the
coefficients of the models are the same. For the coefficients that
differ, the reason is that OLS in R has automatically detected highly
correlated variables and excluded them from the model, thus some of the
coefficients are NAs and others are different.
Predict:
y_hat_train <- predict(lm_, X_train)
## Warning in predict.lm(lm_, X_train): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
y_hat_test <- predict(lm_, X_test)
## Warning in predict.lm(lm_, X_test): prediction from rank-deficient fit; attr(*,
## "non-estim") has doubtful cases
Spearman correlation for the predictions on the training set:
cor(y_train$TARGET, y_hat_train, method = "spearman")
## [1] 0.2786787
This correlation value is the same as the one of the Py benchmark model.
Export predictions on the test set:
to_export <- tibble(ID = X_test_raw$ID, TARGET = y_hat_test)
write_csv(to_export, "../data/y_hat_test.csv")